{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/weight-height.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.plot(kind='scatter',\n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        title='Weight and Height in adults')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.plot(kind='scatter',\n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        title='Weight and Height in adults')\n",
    "\n",
    "# Here we're plotting the red line 'by hand' with fixed values\n",
    "# We'll try to learn this line with an algorithm below\n",
    "plt.plot([55, 78], [75, 250], color='red', linewidth=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def line(x, w=0, b=0):\n",
    "    return x * w + b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x = np.linspace(55, 80, 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "yhat = line(x, w=0, b=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "yhat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.plot(kind='scatter',\n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        title='Weight and Height in adults')\n",
    "plt.plot(x, yhat, color='red', linewidth=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cost Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mean_squared_error(y_true, y_pred):\n",
    "    s = (y_true - y_pred)**2\n",
    "    return s.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = df[['Height']].values\n",
    "y_true = df['Weight'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_true"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = line(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mean_squared_error(y_true, y_pred.ravel())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### you do it!\n",
    "\n",
    "Try changing the values of the parameters b and w in the line above and plot it again to see how the plot and the cost  change."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 5))\n",
    "\n",
    "# we are going to draw 2 plots in the same figure\n",
    "# first plot, data and a few lines\n",
    "ax1 = plt.subplot(121)\n",
    "df.plot(kind='scatter',\n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        title='Weight and Height in adults', ax=ax1)\n",
    "\n",
    "# let's explore the cost function for a few values of b between -100 and +150\n",
    "bbs = np.array([-100, -50, 0, 50, 100, 150])\n",
    "mses = []  # we will append the values of the cost here, for each line\n",
    "for b in bbs:\n",
    "    y_pred = line(X, w=2, b=b)\n",
    "    mse = mean_squared_error(y_true, y_pred)\n",
    "    mses.append(mse)\n",
    "    plt.plot(X, y_pred)\n",
    "\n",
    "# second plot: Cost function\n",
    "ax2 = plt.subplot(122)\n",
    "plt.plot(bbs, mses, 'o-')\n",
    "plt.title('Cost as a function of b')\n",
    "plt.xlabel('b')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear Regression with Keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.optimizers import Adam, SGD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = Sequential()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.add(Dense(1, input_shape=(1,)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(Adam(lr=0.8), 'mean_squared_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.fit(X, y_true, epochs=40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = model.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.plot(kind='scatter',\n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        title='Weight and Height in adults')\n",
    "plt.plot(X, y_pred, color='red')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "W, B = model.get_weights()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "W"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "B"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluating Model Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import r2_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The R2 score is {:0.3f}\".format(r2_score(y_true, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y_true,\n",
    "                                                    test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "W[0, 0] = 0.0\n",
    "B[0] = 0.0\n",
    "model.set_weights((W, B))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.fit(X_train, y_train, epochs=50, verbose=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train_pred = model.predict(X_train).ravel()\n",
    "y_test_pred = model.predict(X_test).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error as mse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The Mean Squared Error on the Train set is:\\t{:0.1f}\".format(mse(y_train, y_train_pred)))\n",
    "print(\"The Mean Squared Error on the Test set is:\\t{:0.1f}\".format(mse(y_test, y_test_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The R2 score on the Train set is:\\t{:0.3f}\".format(r2_score(y_train, y_train_pred)))\n",
    "print(\"The R2 score on the Test set is:\\t{:0.3f}\".format(r2_score(y_test, y_test_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/user_visit_duration.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.plot(kind='scatter', x='Time (min)', y='Buy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = Sequential()\n",
    "model.add(Dense(1, input_shape=(1,), activation='sigmoid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(SGD(lr=0.5), 'binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = df[['Time (min)']].values\n",
    "y = df['Buy'].values\n",
    "\n",
    "model.fit(X, y, epochs=25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ax = df.plot(kind='scatter', x='Time (min)', y='Buy',\n",
    "             title='Purchase behavior VS time spent on site')\n",
    "\n",
    "temp = np.linspace(0, 4)\n",
    "ax.plot(temp, model.predict(temp), color='orange')\n",
    "plt.legend(['model', 'data'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "temp_class = model.predict(temp) > 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ax = df.plot(kind='scatter', x='Time (min)', y='Buy',\n",
    "             title='Purchase behavior VS time spent on site')\n",
    "\n",
    "temp = np.linspace(0, 4)\n",
    "ax.plot(temp, temp_class, color='orange')\n",
    "plt.legend(['model', 'data'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = model.predict(X)\n",
    "y_class_pred = y_pred > 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The accuracy score is {:0.3f}\".format(accuracy_score(y, y_class_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train/Test split\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "params = model.get_weights()\n",
    "params = [np.zeros(w.shape) for w in params]\n",
    "model.set_weights(params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The accuracy score is {:0.3f}\".format(accuracy_score(y, model.predict(X) > 0.5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.fit(X_train, y_train, epochs=25, verbose=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The train accuracy score is {:0.3f}\".format(accuracy_score(y_train, model.predict(X_train) > 0.5)))\n",
    "print(\"The test accuracy score is {:0.3f}\".format(accuracy_score(y_test, model.predict(X_test) > 0.5)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.wrappers.scikit_learn import KerasClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_logistic_regression_model():\n",
    "    model = Sequential()\n",
    "    model.add(Dense(1, input_shape=(1,), activation='sigmoid'))\n",
    "    model.compile(SGD(lr=0.5),\n",
    "                  'binary_crossentropy',\n",
    "                  metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = KerasClassifier(build_fn=build_logistic_regression_model,\n",
    "                        epochs=25,\n",
    "                        verbose=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score, KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cv = KFold(3, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "scores = cross_val_score(model, X, y, cv=cv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"The cross validation accuracy is {:0.4f} ± {:0.4f}\".format(scores.mean(), scores.std()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Confusion Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "confusion_matrix(y, y_class_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def pretty_confusion_matrix(y_true, y_pred, labels=[\"False\", \"True\"]):\n",
    "    cm = confusion_matrix(y_true, y_pred)\n",
    "    pred_labels = ['Predicted '+ l for l in labels]\n",
    "    df = pd.DataFrame(cm, index=labels, columns=pred_labels)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pretty_confusion_matrix(y, y_class_pred, ['Not Buy', 'Buy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_score, recall_score, f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"Precision:\\t{:0.3f}\".format(precision_score(y, y_class_pred)))\n",
    "print(\"Recall:  \\t{:0.3f}\".format(recall_score(y, y_class_pred)))\n",
    "print(\"F1 Score:\\t{:0.3f}\".format(f1_score(y, y_class_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(classification_report(y, y_class_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Categorical Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/weight-height.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['Gender'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pd.get_dummies(df['Gender'], prefix='Gender').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Transformations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1) Rescale with fixed factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['Height (feet)'] = df['Height']/12.0\n",
    "df['Weight (100 lbs)'] = df['Weight']/100.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.describe().round(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### MinMax normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "mms = MinMaxScaler()\n",
    "df['Weight_mms'] = mms.fit_transform(df[['Weight']])\n",
    "df['Height_mms'] = mms.fit_transform(df[['Height']])\n",
    "df.describe().round(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3) Standard normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "ss = StandardScaler()\n",
    "df['Weight_ss'] = ss.fit_transform(df[['Weight']])\n",
    "df['Height_ss'] = ss.fit_transform(df[['Height']])\n",
    "df.describe().round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 5))\n",
    "\n",
    "for i, feature in enumerate(['Height', 'Height (feet)', 'Height_mms', 'Height_ss']):\n",
    "    plt.subplot(1, 4, i+1)\n",
    "    df[feature].plot(kind='hist', title=feature)\n",
    "    plt.xlabel(feature)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Machine Learning Exercises"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 1\n",
    "\n",
    "You've just been hired at a real estate investment firm and they would like you to build a model for pricing houses. You are given a dataset that contains data for house prices and a few features like number of bedrooms, size in square feet and age of the house. Let's see if you can build a model that is able to predict the price. In this exercise we extend what we have learned about linear regression to a dataset with more than one feature. Here are the steps to complete it:\n",
    "\n",
    "1. Load the dataset ../data/housing-data.csv\n",
    "- plot the histograms for each feature\n",
    "- create 2 variables called X and y: X shall be a matrix with 3 columns (sqft,bdrms,age) and y shall be a vector with 1 column (price)\n",
    "- create a linear regression model in Keras with the appropriate number of inputs and output\n",
    "- split the data into train and test with a 20% test size\n",
    "- train the model on the training set and check its accuracy on training and test set\n",
    "- how's your model doing? Is the loss growing smaller?\n",
    "- try to improve your model with these experiments:\n",
    "    - normalize the input features with one of the rescaling techniques mentioned above\n",
    "    - use a different value for the learning rate of your model\n",
    "    - use a different optimizer\n",
    "- once you're satisfied with training, check the R2score on the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 2\n",
    "\n",
    "Your boss was extremely happy with your work on the housing price prediction model and decided to entrust you with a more challenging task. They've seen a lot of people leave the company recently and they would like to understand why that's happening. They have collected historical data on employees and they would like you to build a model that is able to predict which employee will leave next. They would like a model that is better than random guessing. They also prefer false negatives than false positives, in this first phase. Fields in the dataset include:\n",
    "\n",
    "- Employee satisfaction level\n",
    "- Last evaluation\n",
    "- Number of projects\n",
    "- Average monthly hours\n",
    "- Time spent at the company\n",
    "- Whether they have had a work accident\n",
    "- Whether they have had a promotion in the last 5 years\n",
    "- Department\n",
    "- Salary\n",
    "- Whether the employee has left\n",
    "\n",
    "Your goal is to predict the binary outcome variable `left` using the rest of the data. Since the outcome is binary, this is a classification problem. Here are some things you may want to try out:\n",
    "\n",
    "1. load the dataset at ../data/HR_comma_sep.csv, inspect it with `.head()`, `.info()` and `.describe()`.\n",
    "- Establish a benchmark: what would be your accuracy score if you predicted everyone stay?\n",
    "- Check if any feature needs rescaling. You may plot a histogram of the feature to decide which rescaling method is more appropriate.\n",
    "- convert the categorical features into binary dummy columns. You will then have to combine them with the numerical features using `pd.concat`.\n",
    "- do the usual train/test split with a 20% test size\n",
    "- play around with learning rate and optimizer\n",
    "- check the confusion matrix, precision and recall\n",
    "- check if you still get the same results if you use a 5-Fold cross validation on all the data\n",
    "- Is the model good enough for your boss?\n",
    "\n",
    "As you will see in this exercise, the a logistic regression model is not good enough to help your boss. In the next chapter we will learn how to go beyond linear models.\n",
    "\n",
    "This dataset comes from https://www.kaggle.com/ludobenistant/hr-analytics/ and is released under [CC BY-SA 4.0 License](https://creativecommons.org/licenses/by-sa/4.0/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
